Introduction

In this project we use a dataset of wine reviews to predict review points from numerical, categorical and textual predictors.

The data is from Kaggle Datasets, and covers 150k wine reviews along with some attributes of the wines. It can be found here. (A (free) Kaggle login is required to access it directly from kaggle.com). The data was originally scraped from WineEnthusiast.

The dataset contains the following columns:

This is a particularly interesting problem for several reasons:

Methods

Feature Engineering

### Add 'continentTopFive' feature
wine = data.frame(wine, continentTopFive = getContinent_withTop5(wine$country))
#levels(wine$continentTopFive)
ggplot(wine, aes(x=log(price), y=points)) + 
  geom_point(aes(col = continentTopFive)) +stat_summary(fun.data=mean_cl_normal) + 
    stat_smooth(aes(colour=continentTopFive),method="lm",se = FALSE) + 
            scale_colour_manual(values = c("red","green", "blue", "orange", "dodger blue", "violet", "dark green")) + ggtitle("Points versus Log(price), by ContinentTopFive") + theme(plot.title = element_text(hjust = 0.5)) + labs(x="log(price)",y="points")

### Perform LDA and sentiment analysis on the description

corpus = tm::Corpus(tm::VectorSource(as.character(wine$description)))
## transform corpus: remove white space and punctuation, transform to lower case, remove stop words, stem.
corpus = tm::tm_map(corpus, stripWhitespace)
corpus = tm::tm_map(corpus, removePunctuation)
corpus = tm::tm_map(corpus, content_transformer(tolower))
corpus = tm::tm_map(corpus, removeWords, stopwords("english"))
# corpus = tm_map(corpus, removeWords, c("wine"))
corpus = tm::tm_map(corpus, stemDocument)
## create DocumentTerm matrix for LDA
corpus_dtm = DocumentTermMatrix(corpus)

NUM_TOPICS = 2

corpus_lda = topicmodels::LDA(corpus_dtm, k = NUM_TOPICS, control = list(seed = 1234))
corpus_documents = tidy(corpus_lda, matrix = "gamma")

SENTIMENT_ENGINE = "bing"
## Derive sentiment score
corpus_sentiments <- tidy(corpus_dtm) %>%
  inner_join(get_sentiments(SENTIMENT_ENGINE), by = c(term = "word")) %>%
  count(document, sentiment, wt = count) %>%
  ungroup() %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## introduce new predictors
wine = data.frame(wine, topic1 = corpus_documents$gamma[1:nrow(wine)], document = rownames(wine))
wine = merge(wine, corpus_sentiments[,c(1,4)], by = "document")
wine = wine[complete.cases(wine[, c("points","price", "continentTopFive", "topic1", "sentiment")]), c("points","price", "continentTopFive", "topic1", "sentiment")]
names(wine)
## [1] "points"           "price"            "continentTopFive"
## [4] "topic1"           "sentiment"

Training and Test Set Generation

We shuffled our data, then created an 80% training, 20% test split for later use in cross validation. We did 10 fold cross validation of every model.

## REMOVE TEST SET = 20% of data.
#Randomly shuffle the data
wine = wine[sample(nrow(wine)),]

# Sample 80% to training, 20% to test set.
smp_size = floor(0.80 * nrow(wine))
train_ind = sample(seq_len(nrow(wine)), size = smp_size)
wine_train = wine[train_ind, ]
wine_test = wine[-train_ind, ]
# unit test the partitioning
nrow(wine) == nrow(wine_test) + nrow(wine_train)

Model Selection

Model #1

mod = lm(log(points-79.999) ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train)
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)
# plot(select)
# bptest(select)
# shapiro.test(sample(resid(select),5000))

# remove outliers and highly influential points and refit the *select*ed model
keep = which(abs(resid(select)) <= 7 & cooks.distance(select) <= 4 / nrow(wine_train))

select_formula1 = log(points - 79.999) ~ log(price) + continentTopFive + topic1 + 
    log(price):continentTopFive + continentTopFive:topic1
mod1 = lm(select_formula1, data=wine_train, subset=keep)

# Plot details and perform tests for homoscedasticity and normality of errors.
plot(mod1)

bptest(mod1)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod1
## BP = 2814.5, df = 20, p-value < 2.2e-16
shapiro.test(sample(resid(mod1),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod1), 5000)
## W = 0.91539, p-value < 2.2e-16
wine_train_cv = wine_train[keep,]

RMSE1 = cv_mean_rmse(select_formula1, wine_train_cv, 10, function(x) {exp(x) + 79.999} )

Model 2

mod = lm(points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train)
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)
# plot(select)
# bptest(select)
# shapiro.test(sample(resid(select),5000))

#remove highly influential points
keep = which(cooks.distance(select) <= 4 / nrow(wine_train))
select_formula2 = points ~ log(price) + continentTopFive + topic1 + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1
mod2 = lm(select_formula2, data=wine_train, subset=keep)
plot(mod2)

bptest(mod2)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod2
## BP = 5516.9, df = 21, p-value < 2.2e-16
shapiro.test(sample(resid(mod2),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod2), 5000)
## W = 0.998, p-value = 4.588e-06
wine_train_cv = wine_train[keep,]

RMSE2 = cv_mean_rmse(select_formula2, wine_train_cv, 10)

Model 3

bc_mod = caret::BoxCoxTrans(wine_train$points)

wine_train = cbind(wine_train, bc_points = predict(bc_mod, wine_train$points))
mod = lm(bc_points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train)
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)

keep = cooks.distance(mod) <= 4 / nrow(wine_train)
select_formula3 = bc_points ~ log(price) + continentTopFive + topic1 + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1

mod3 = lm(select_formula3, data=wine_train, subset = keep)
plot(mod3)

bptest(mod3)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 4528.8, df = 21, p-value < 2.2e-16
shapiro.test(sample(resid(mod3),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod3), 5000)
## W = 0.99359, p-value = 3.441e-14
### Perform 10 fold cross validation

#Randomly shuffle the training set and create 10 folds
wine_train_cv = wine_train[keep,]

# see section 13.1.2 at http://daviddalpiaz.github.io/appliedstats/transformations.html#response-transformation
RMSE3 = cv_mean_rmse(select_formula3, wine_train_cv, 10, function(x) {(x*bc_mod$lambda + 1)^(1/bc_mod$lambda)})

Model 4

mod = lm(points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train, subset = keep, weights = sqrt(1 / 1 + abs(points -90)))
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)
keep = cooks.distance(mod) <= 4 / nrow(wine_train)

select_formula4 = points ~ log(price) + continentTopFive + topic1 + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1
mod4 = lm(select_formula4, data=wine_train, subset = keep, weights = sqrt(1 / 1 + abs(points -90)))

plot(mod4)

bptest(mod4)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod4
## BP = 4449.3, df = 21, p-value < 2.2e-16
shapiro.test(sample(resid(mod4),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod4), 5000)
## W = 0.99787, p-value = 2.059e-06
### Perform 10 fold cross validation

#Randomly shuffle the training set and create 10 folds
wine_train_cv = wine_train[keep,]

RMSE4 = cv_mean_rmse(select_formula4, wine_train_cv, 10)

Model 5

mod = lm(points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3)  + I(log(price)^4) + I(log(price)^5)  + I(log(price)^6) + I(log(price)^7), data=wine_train)
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)
# plot(select)
# bptest(select)
# shapiro.test(sample(resid(select),5000))

#remove highly influential points
keep = which(cooks.distance(select) <= 4 / nrow(wine_train))

#update
select_formula5 = points ~ log(price) + continentTopFive + topic1 + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1
mod5 = lm(select_formula5, data=wine_train, subset=keep)
plot(mod2)

bptest(mod2)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod2
## BP = 5516.9, df = 21, p-value < 2.2e-16
shapiro.test(sample(resid(mod5),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod5), 5000)
## W = 0.9943, p-value = 3.436e-13
wine_train_cv = wine_train[keep,]

RMSE5 = cv_mean_rmse(select_formula5, wine_train_cv, 10)

Results

rmse_data = data.frame(rmse = c(RMSE1, RMSE2, RMSE3, RMSE4, RMSE5))
colnames(rmse_data) = c("CV RMSE")
rownames(rmse_data) = c("model1", "model2", "model3", "model4", "model5")

best_model = which.min(rmse_data$`CV RMSE`)
best_cv_rmse = round(rmse_data[best_model, 1],4)
eqn1 = paste("$\\hat{y} \\pm 2*", best_cv_rmse, "$", sep = "")
rmse_data
##         CV RMSE
## model1 2.567319
## model2 2.432905
## model3 2.449548
## model4 2.534950
## model5 2.437489
preds = predict(mod2, newdata=wine_test)
success_ratio = sum(wine_test$points >= preds - 2*best_cv_rmse & wine_test$points <= preds + 2*best_cv_rmse) / nrow(wine_test)

Discussion

We used four different strategies to attempt to meet the assumptions of multiple linear regression and minimize cross-validated RMSE. Attempts include transforming response with BoxCox and log, integrated polynomial models over the engineered features, and weighted least squares regression. In the end, the non-transformed response model had the lowest cross-validated RMSE with a value of 2.4329049.

Normality of the errors and homoskedasticity are suspect, as demonstrated by the Breusch-Pagan and Shapiro-Wilk tests. we tried many attempts to get normality and homoskedasticity (including additional less interesting attempts not shown here), but were not successful. We see that the variance seems highest for points around 90, and tapers to the extremes. However, because Shapiro-Wilk in R can take only a maximum of 5000 residuals, sampling is required, and this test is then dependent on the choice of random seed used in the sampling.

We get around the normal and homoskedastic assumptions by using cross-validated RMSE to create our prediction intervals. Doing so gives a 94% success rate on the withheld test set with a +/- 4.8 point prediction interval.

Potential for future improvements: This project could potentially be further expanded in the future by using deeper NLP, expanding with additional predictors, and expanded model selection attempts. It may also be the case that the data is noisy, with a high base \(\epsilon\) variance of points no matter which predictors are used. (subjective human judgement after all).

Appendix

About the LDA

corpus_topics = tidy(corpus_lda, matrix = "beta")
#corpus_topics
corpus_top_terms = corpus_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

corpus_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) + ggtitle("Word Probabilities for the 2 Topics") +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

beta_spread = corpus_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
#beta_spread

## Plot the beta spread of the topics
ggplot(beta_spread[order(abs(beta_spread$log_ratio)),][1:30,], aes(x=term, y=log_ratio)) + geom_bar(stat="identity", fill="green", width=.2) + coord_flip() + ggtitle("Log ratio of word probabilities in topic2/topic1")